#import required packages
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error, max_error
from sklearn import svm
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw 2021-2022 assessment roll.
# low_memory=False reads the file in a single pass so mixed-type columns
# receive one inferred dtype instead of triggering a DtypeWarning.
df = pd.read_csv("Current__2021-2022__Assessment_Roll.csv", low_memory=False)
# preview the first ten records
df.head(10)
| SBL | TAX DISTRICT | PRINT KEY | FRONT | DEPTH | PROP CLASS DESCRIPTION | PREVIOUS PROPERTY CLASS | OWNER1 | OWNER2 | PREVIOUS OWNER | ... | SPECIAL DISTRICT CODE | STORY HEIGHT | SWIS | USED AS CODE | WALL A | WALL B | WALL C | MAILZIP5 | MAILZIP4 | PROPERTY CLASS CODE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0785000008017900 | 147009 | 78.50-8-17.9 | 65.00 | 30.0 | RESIDENTIAL VACANT LAND | 311.0 | CHANDRASEKHARAN PRAVEEN K | RAWAT MUNMUN | NaN | ... | SE | 0 | 147009 | NaN | 0 | 0 | 0 | 14216 | NaN | 311.0 |
| 1 | 6660000065023000 | 147001 | 666.00-65-23 | 0.00 | 0.0 | TELEPHONE | 831.0 | CENTURY LINK | NaN | NaN | ... | SE | 0 | 147001 | NaN | 0 | 0 | 0 | 80021 | NaN | 831.0 |
| 2 | 5550000091007000 | 147001 | 555.00-91-7 | 0.00 | 0.0 | TELEPHONE - SPECIAL FRANCHISE | 866.0 | TVC ALBANY INC | NaN | NaN | ... | SE | 0 | 147001 | NaN | 0 | 0 | 0 | 12207 | NaN | 866.0 |
| 3 | 1236900001016000 | 147013 | 123.69-1-16 | 30.00 | 109.0 | RESIDENTIAL VACANT LAND | 311.0 | CITY OF BUFFALO | PERFECTING TITLE | NaN | ... | SE | 0 | 147013 | NaN | 0 | 0 | 0 | 14202 | NaN | 311.0 |
| 4 | 100390000103200000 | 147006 | 100.39-1-32./83 | 1.00 | 215.0 | COM VAC W/IMP | 331.0 | DELAWARE TOWERS | ASSOCIATES INC | NaN | ... | SE | 0 | 147006 | NaN | 0 | 0 | 0 | 14222 | 1618 | 331.0 |
| 5 | 0907700003012000 | 147005 | 90.77-3-12 | 54.28 | 117.0 | RESIDENTIAL VACANT LAND | 311.0 | WALKER ETHEL L | NaN | NaN | ... | SE | 0 | 147005 | NaN | 0 | 0 | 0 | 14215 | NaN | 311.0 |
| 6 | 1225100002013200 | 147013 | 122.51-2-13.2 | 15.00 | 173.0 | RESIDENTIAL VACANT LAND | 311.0 | PIKUL MARY | NaN | NaN | ... | SE | 0 | 147013 | NaN | 0 | 0 | 0 | 14210 | NaN | 311.0 |
| 7 | 1112400002032000 | 147003 | 111.24-2-32 | 30.00 | 100.0 | RESIDENTIAL VACANT LAND | 311.0 | GIBSON CLARENCE | NaN | NaN | ... | SE | 0 | 147003 | NaN | 0 | 0 | 0 | 14205 | 0633 | 311.0 |
| 8 | 1113900005003000 | 147003 | 111.39-5-3 | 29.82 | 0.0 | COM VAC W/IMP | 331.0 | QUERMBACK ELECTRIC INC | NaN | NaN | ... | SE | 0 | 147003 | J03 | 0 | 0 | 0 | 14203 | NaN | 331.0 |
| 9 | 1114600010009000 | 147003 | 111.46-10-9 | 26.00 | 110.0 | OFFICE BUILDING | 464.0 | VALLEDOLMO LLC | NaN | KEATING, KENT G | ... | SE | 10 | 147003 | NaN | 0 | 100 | 0 | 14202 | NaN | 464.0 |
10 rows × 85 columns
# number of rows in the raw dataset (before any cleaning)
len(df)
93653
# get the number of null values in each of the columns
null_vals = df.isna().sum()
# Derive the thresholds from the row count instead of hard-coding them:
# the original used 74922 and 18730, which are int(0.8 * 93653) and
# int(0.2 * 93653) for this specific file — so results are unchanged here,
# but the code now adapts to a different input size.
n_rows = len(df)
high_null_threshold = int(0.8 * n_rows)  # > 80% missing -> drop the column
low_null_threshold = int(0.2 * n_rows)   # <= 20% missing -> drop those rows
## drop columns that are missing in more than 80% of rows
cols_to_drop = null_vals[null_vals.values > high_null_threshold].index
df.drop(labels=cols_to_drop, axis=1, inplace=True)
# for columns having missing values in less than 20% of the rows,
# drop the rows with the missing value in those columns.
results = null_vals[null_vals.values <= high_null_threshold]
cols_rows_to_drop = results[results.values <= low_null_threshold].index
df.dropna(axis=0, subset=cols_rows_to_drop, inplace=True)
#median/mode imputation
def impute(X, cols):
    """Impute missing values in a DataFrame.

    For each column in `cols`: object (string) columns get their mode,
    float columns get their median. Any NaNs remaining in other columns
    are then filled with 0. Returns a new DataFrame; the input is not
    mutated.

    Bug fix: the original ran ``X = X.fillna(value=0)`` *before* the
    mode/median loop, which removed every NaN first and made the loop
    dead code — everything was silently filled with 0. The blanket
    zero-fill now runs after the per-column imputation. Column-level
    ``fillna(..., inplace=True)`` is also replaced with assignment,
    which modern pandas requires.
    """
    X = X.copy()
    for c in cols:
        # if the column is of type object, ie string, replace missing values with the mode
        if X[c].dtypes == object:
            X[c] = X[c].fillna(X[c].mode().values[0])
        # if the column is of type float, replace missing values with the median
        elif X[c].dtypes == float:
            X[c] = X[c].fillna(X[c].median())
    # any column not handled above falls back to 0
    X = X.fillna(value=0)
    return X
# NOTE(review): this re-uses the <= 18730 mask, but rows with NaNs in these
# columns were already dropped by the dropna step above, so there should be
# nothing left to fill in them. Possibly the complement (> 18730, i.e. the
# columns with 20-80% missing) was intended — confirm with the author.
cols_to_fill = results[results.values <= 18730].index
df = impute(df, cols_to_fill)
## drop rows where the 5-digit ZIP code is missing, as there is no
## property without a valid ZIP code
df = df.dropna(subset=["ZIP CODE (5-DIGIT)"])
# remove duplicate rows
df = df.drop_duplicates()
# round fractional bath counts up before the integer cast
df["# OF BATHS"] = np.ceil(df["# OF BATHS"]).astype(int)
# cast the remaining whole-number columns from float to int
for col in ("ZIP CODE (5-DIGIT)", "OVERALL CONDITION", "# OF KITCHENS",
            "PREVIOUS PROPERTY CLASS", "YEAR BUILT", "BASEMENT TYPE",
            "HEAT TYPE"):
    df[col] = df[col].astype(int)
#df["BUILDING STYLE"] = df["BUILDING STYLE"].astype(int)
# number of rows remaining in the cleaned dataset
len(df)
89457
# persist the cleaned dataset for the visualization/modeling sections below
df.to_csv('cleaned_dataset.csv', index=False)
# Reload the cleaned dataset. low_memory=False avoids the mixed-dtype
# DtypeWarning on columns 72/73 (visible in the original run) and makes
# this read consistent with the other read_csv calls in the file.
df = pd.read_csv("cleaned_dataset.csv", low_memory=False)
/Users/makhtarsylla/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (72,73) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
import plotly.express as px

# histogram of property counts per ZIP code
fig = px.histogram(
    df,
    x="ZIP CODE (5-DIGIT)",
    labels={"ZIP CODE (5-DIGIT)": "ZIP CODE", "count": "NUMBER OF PROPERTIES"},
)
layout_opts = dict(
    title_text='Distribution of Property based on ZIP Code',
    xaxis_title_text='ZIP Code',
    yaxis_title_text='Number of Properties',
    bargap=0.2,
    yaxis_tickformat='digits',
    xaxis_tickformat='digits',
    xaxis_dtick=1,
)
fig.update_layout(**layout_opts)
fig.show()
# number of properties in each neighborhood
neighborhood_values = pd.DataFrame(df['NEIGHBORHOOD'].value_counts())
neighborhood_values.reset_index(inplace=True)
neighborhood_values.columns = ["Neighborhood", "Number of Properties"]
# fix: the chart title previously misspelled "neighboorhood"
px.bar(neighborhood_values, x="Neighborhood", y="Number of Properties",
       title="Number of properties in each neighborhood")
# Pie chart of property types; classes with fewer than 450 parcels are
# collapsed into a single "OTHERS" slice to keep the chart readable.
property_count_per_class = df['PROP CLASS DESCRIPTION'].value_counts()
prop_class_labels = list(property_count_per_class.index)
prop_class_counts = list(property_count_per_class.values)
class_labels_list = []
class_counts_list = []
other_labels = []
others_count = 0
for label, count in zip(prop_class_labels, prop_class_counts):
    if count < 450:
        # too small for its own slice — fold into OTHERS
        others_count += count
        other_labels.append(label)
    else:
        class_labels_list.append(label)
        class_counts_list.append(count)
class_labels_list.append("OTHERS")
class_counts_list.append(others_count)
fig = px.pie(values=class_counts_list, names=class_labels_list, title="Distribution of Property type")
fig.show()
# average value of one- and two-family dwellings, grouped by ZIP code
family_mask = df["PROP CLASS DESCRIPTION"].isin(
    ["ONE FAMILY DWELLING", "TWO FAMILY DWELLING"])
one_two_family = (
    df.loc[family_mask]
      .groupby(["ZIP CODE (5-DIGIT)", "PROP CLASS DESCRIPTION"])["TOTAL VALUE"]
      .mean()
      .reset_index()
)
one_two_family["AVERAGE PRICE"] = one_two_family["TOTAL VALUE"]
plt.figure(figsize=(14, 5))
sns.barplot(data=one_two_family,
            x="ZIP CODE (5-DIGIT)",
            y="AVERAGE PRICE",
            hue="PROP CLASS DESCRIPTION")
plt.title("AVERAGE ONE AND TWO FAMILY HOUSE VALUE")
plt.show()
# average total value per neighborhood
neighborhood_avgs = (
    df.groupby('NEIGHBORHOOD')["TOTAL VALUE"].mean().reset_index()
)
neighborhood_avgs["AVERAGE PRICE"] = neighborhood_avgs["TOTAL VALUE"]
px.bar(neighborhood_avgs, x="NEIGHBORHOOD", y="AVERAGE PRICE")
# compute Average price of commercial properties:
# "commercial" = everything that is not a dwelling, apartment, vacant
# land, or religious property (matched by keyword in the class description)
apartment_df = df[df['PROP CLASS DESCRIPTION'].str.contains("APARTMENT")]
dwelling = df[df['PROP CLASS DESCRIPTION'].str.contains("DWELLING")]
land = df[df['PROP CLASS DESCRIPTION'].str.contains("LAND")]
religious = df[df['PROP CLASS DESCRIPTION'].str.contains("RELIGIOUS")]
# fix: each filter now builds its mask from the frame being indexed
# (`commercial`) rather than from `df`; the original raised
# "Boolean Series key will be reindexed" warnings because the mask's
# index no longer matched the already-filtered frame.
commercial = df[~df.SBL.isin(dwelling.SBL)]
commercial = commercial[~commercial.SBL.isin(apartment_df.SBL)]
commercial = commercial[~commercial.SBL.isin(land.SBL)]
commercial = commercial[~commercial.SBL.isin(religious.SBL)]
commercial = pd.DataFrame(
    commercial.groupby('NEIGHBORHOOD')["TOTAL VALUE"].aggregate(func="mean")).reset_index()
commercial["AVERAGE PRICE"] = commercial["TOTAL VALUE"]
px.bar(commercial, x="NEIGHBORHOOD", y="AVERAGE PRICE", title="Average commercial property value in each neighborhood")
<ipython-input-21-27f9550ffff1>:8: UserWarning: Boolean Series key will be reindexed to match DataFrame index. <ipython-input-21-27f9550ffff1>:9: UserWarning: Boolean Series key will be reindexed to match DataFrame index. <ipython-input-21-27f9550ffff1>:10: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
# load cleaned dataset
df = pd.read_csv("cleaned_dataset.csv", low_memory=False)
# split into predictors X and response Y
feature_cols = [c for c in df.columns if c != "TOTAL VALUE"]
X = df[feature_cols]
Y = df["TOTAL VALUE"]
# correlation of every feature with the response, highest first
corr = pd.DataFrame(X.corrwith(Y)).reset_index()
corr.columns = ["COLUMN", "CORRELATION"]
corr = corr.sort_values(by="CORRELATION", axis=0, ascending=False)
# show 10 random rows
corr.sample(10)
| COLUMN | CORRELATION | |
|---|---|---|
| 36 | CONSTRUCTION QUALITY CODE | 0.190391 |
| 6 | DEED BOOK | -0.053710 |
| 34 | CENTRAL AIR | 0.007149 |
| 0 | TAX DISTRICT | -0.046548 |
| 14 | SECOND STORY AREA | -0.004746 |
| 17 | HEAT TYPE | -0.054027 |
| 22 | # OF BATHS | -0.030705 |
| 1 | FRONT | 0.166180 |
| 38 | STORY HEIGHT | 0.208753 |
| 15 | TOTAL LIVING AREA | -0.029613 |
# bar chart of feature-vs-response correlations
px.bar(
    corr,
    x="COLUMN",
    y="CORRELATION",
    height=1000,
    range_y=[-0.15, 0.5],
    title="Correlation of each feature with the response variable, Total Value",
)
# Reload the cleaned dataset. low_memory=False avoids the mixed-dtype
# DtypeWarning on columns 72/73 seen in the original run and matches the
# other read_csv calls in the file.
df = pd.read_csv("cleaned_dataset.csv", low_memory=False)
/Users/makhtarsylla/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (72,73) have mixed types.Specify dtype option on import or set low_memory=False.
# Drop extreme-value rows, treated as noise/outliers.
# NOTE(review): the original comment said "greater than 9 million", but the
# filter below keeps rows with TOTAL VALUE below 900,000 (0.9 million) —
# confirm which threshold was actually intended.
df = df[df["TOTAL VALUE"] < 900000]
# feature selection result: predictor columns chosen for modeling
columns = ['TAX DISTRICT', 'FRONT', 'DEPTH',
           'PREVIOUS PROPERTY CLASS', 'ROLL',
           'LAND VALUE', 'YEAR BUILT', 'ZIP CODE (5-DIGIT)',
           'TOTAL LIVING AREA', 'OVERALL CONDITION', 'HEAT TYPE',
           'BASEMENT TYPE', '# OF FIREPLACES', '# OF BEDS', '# OF BATHS',
           '# OF KITCHENS', 'LATITUDE', 'LONGITUDE']
x = np.asarray(df[columns])
y = np.asarray(df['TOTAL VALUE'])
# NOTE(review): this runs after x/y are already extracted, so it only
# affects later uses of df (e.g. df[columns] in the plotting cells)
df = df.select_dtypes(include=["float64", "int64"])
# split data into training (90%) and testing (10%) with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9, random_state=2)
# fit an ordinary least-squares model on the training split
linearRegressionModel = LinearRegression()
linearRegressionModel.fit(x_train, y_train)
# train set score (R^2)
linearRegressionModel.score(x_train, y_train)
0.6985081038532917
# test set score (R^2 on the held-out 10% split)
linearRegressionModel.score(x_test, y_test)
0.7083762085198404
# mean squared error of the linear model on the test set
mean_squared_error(y_test, linearRegressionModel.predict(x_test))
4145210684.005554
# maximum error: the single worst absolute residual on the test set
max_error(y_test, linearRegressionModel.predict(x_test))
645922.5619183481
# predicted vs. actual scatter for the linear model; points on the
# dashed diagonal are perfect predictions
fig = plt.figure()
df_viz = df[columns]
ax = fig.add_axes([0, 0, 1, 1])
ax.scatter(y_test, linearRegressionModel.predict(x_test), color='r')
diag = np.linspace(0, np.max(y_test), 2)
plt.plot(diag, diag, linestyle='--', color='k')
plt.legend(["Optimal Line", "Prediction"])
fig.show()
<ipython-input-40-8f54c360adf1>:8: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
# NOTE(review): Lasso's `normalize` parameter was deprecated in
# scikit-learn 1.0 and removed in 1.2; on newer versions this line raises.
# The equivalent is a StandardScaler+Lasso pipeline, but swapping it in
# would not reproduce these exact coefficients, so it is left as-is.
lassoRgrModel = Lasso(normalize=True).fit(x_train, y_train)
# train set score (R^2)
lassoRgrModel.score(x_train, y_train)
0.6977263764713193
# test set score (R^2 on the held-out 10% split)
lassoRgrModel.score(x_test, y_test)
0.707662485514645
# mean squared error of the lasso model on the test set
mean_squared_error(y_test, lassoRgrModel.predict(x_test))
4155355714.3939877
# maximum error: the single worst absolute residual on the test set
max_error(y_test, lassoRgrModel.predict(x_test))
646899.9104899764
# predicted vs. actual scatter for the lasso model
fig = plt.figure()
df_viz = df[columns]
ax = fig.add_axes([0, 0, 1, 1])
ax.scatter(y_test, lassoRgrModel.predict(x_test), color='y')
diag = np.linspace(0, np.max(y_test), 2)
plt.plot(diag, diag, linestyle='--', color='k')
plt.legend(["Optimal Line", "Prediction"])
fig.show()
<ipython-input-46-820a55acb0dc>:8: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.
# random forest regressor with 100 trees
randomForestRgr = RandomForestRegressor(n_estimators=100)
randomForestRgr.fit(x_train, y_train)
# train set score (R^2)
randomForestRgr.score(x_train, y_train)
# test set score (R^2)
randomForestRgr.score(x_test, y_test)
# compute the test-set predictions once and reuse them below
rf_predictions = randomForestRgr.predict(x_test)
# mean squared error
mean_squared_error(y_test, rf_predictions)
# maximum error
max_error(y_test, rf_predictions)
# predicted vs. actual scatter for the random forest
fig = plt.figure()
df_viz = df[columns]
ax = fig.add_axes([0, 0, 1, 1])
ax.scatter(y_test, rf_predictions, color='g')
diag = np.linspace(0, np.max(y_test), 2)
plt.plot(diag, diag, linestyle='--', color='k')
plt.legend(["Optimal Line", "Prediction"])
fig.show()
# ridge regression (L2-regularized least squares)
ridgeRgr = Ridge(max_iter=1000).fit(x_train, y_train)
# train set score (R^2)
ridgeRgr.score(x_train, y_train)
# test set score (R^2)
ridgeRgr.score(x_test, y_test)
# mean squared error
mean_squared_error(y_test, ridgeRgr.predict(x_test))
# maximum error
max_error(y_test, ridgeRgr.predict(x_test))
fig = plt.figure()
df_viz = df[columns]
ax = fig.add_axes([0, 0, 1, 1])
# fix: this section previously plotted randomForestRgr's predictions
# (copy-paste error) — plot the Ridge model's predictions instead
ax.scatter(y_test, ridgeRgr.predict(x_test), color='b')
x_y = np.linspace(0, np.max(y_test), 2)
plt.plot(x_y, x_y, linestyle='--', color='k')
plt.legend(["Optimal Line", "Prediction"])
fig.show()
##!jupyter nbconvert EAS508_Project.ipynb --to html